library("dplyr")
# Load the raw verbal-autopsy extract; the path is relative to the working
# directory.
df_va_original <- read.csv(
  file = "./IHME_PHMRC_VA_DATA_CHILD_Y2013M09D11_0.csv"
)
Some columns were dropped because they are irrelevant, duplicated, or not informative (for example, word frequencies provide little information without context).
# Drop columns that are not necessary.
# The list is wrapped so that no column name is split across lines (the
# original text broke `word_kept` in two, which does not parse). Per the
# accompanying note, the word_* columns are word frequencies that carry little
# information without context.
df_va <- subset(df_va_original, select = -c(
  gs_code34, va34, gs_code46, gs_text46, va46, gs_code55, gs_text55, va55,
  gs_comorbid1, gs_comorbid2, gs_level,
  g1_01d, g1_01m, g1_01y, g1_05, g1_06d, g1_06m, g1_06y,
  g1_07a, g1_07b, g1_07c, g1_08, g1_09, g1_10,
  g2_01, g2_02,
  g2_03ad, g2_03am, g2_03ay, g2_03bd, g2_03bm, g2_03by,
  g2_03cd, g2_03cm, g2_03cy, g2_03dd, g2_03dm, g2_03dy,
  g2_03ed, g2_03em, g2_03ey, g2_03fd, g2_03fm, g2_03fy,
  g3_01, g4_02, g4_03a, g4_03b, g4_04, g4_05,
  g5_05, g5_06a, g5_06b, g5_07, g5_08,
  word_diseas, word_final, word_child, word_condit, word_digest,
  word_glucos, word_bodi, word_tetanus, word_hand, word_failur,
  word_reduc, word_son, word_breath, word_look, word_till,
  word_spot, word_proper, word_medic, word_found, word_girl,
  word_ray, word_babi, word_privat, word_reason, word_poison,
  word_bring, word_renal, word_rash, word_healthi, word_interview,
  word_acquir, word_accid, word_test, word_cough, word_respiratori,
  word_mother, word_traffic, word_hospit, word_come, word_abdomen,
  word_abl, word_result, word_pregnanc, word_suffer, word_check,
  word_pass, word_famili, word_die, word_road, word_colleg,
  word_dengu, word_doctor, word_drown, word_tumor, word_cardio,
  word_eat, word_fall, word_examin, word_acidosi, word_fire,
  word_fit, word_sepsi, word_nilouf, word_happen, word_head,
  word_headach, word_skin, word_blood, word_home, word_hypertens,
  word_immedi, word_bad, word_inject, word_left, word_leg,
  word_daughter, word_malnutrit, word_milk, word_clock, word_anemia,
  word_nilof, word_complain, word_hour, word_nurs, word_snake,
  word_pain, word_pneumonia, word_polic, word_provid, word_boy,
  word_recov, word_malaria, word_asthma, word_explain, word_scan,
  word_gandhi, word_born, word_lung, word_stomach, word_difficulti,
  word_weak, word_client, word_time, word_told, word_transfus,
  word_treat, word_unconsci, word_water, word_loos, word_week,
  word_stool, word_ill, word_lot, word_jaundic, word_communiti,
  word_health, word_deliv, word_drink, word_servic, word_fine,
  word_eye, word_particip, word_money, word_chest, word_increas,
  word_live, word_expir, word_normal, word_brain, word_stay,
  word_urin, word_remov, word_admit, word_bite, word_center,
  word_measl, word_kept, word_especi, word_neck, word_serious,
  word_due, word_care, word_day, word_pox, word_hiv,
  word_icu, word_start, word_nose, word_leukemia, word_caus,
  word_near, word_morn, word_vomit, word_accord, word_gastric,
  word_receiv, word_coma, word_father, word_clinic, word_emerg,
  word_month, word_birth, word_treatment, word_sick, word_dehydr,
  word_prescrib, word_children, word_motion, word_refer, word_ward,
  word_certif, word_advis, word_hous, word_medicin, word_play,
  word_heart, word_diarrhea, word_baby, word_mouth, word_sever,
  word_shock, word_dead, word_oper, word_night, word_indraw,
  word_provinci, word_cancer, word_brought, word_even, word_convuls,
  word_addit, word_deceas, word_take, word_oxygen, word_infect,
  word_cold, word_misplac, word_swell, word_respond, word_transfer,
  word_thank, word_cri, word_sudden, word_continu, word_sent,
  word_stop, word_get, word_fever, word_notic, word_hole,
  word_kidney, word_bluish, word_yellow, word_injuri, word_pulmonari,
  c6_11, c6_12, c6_13, c6_14,
  newid, g5_02, module
))
# Inspect the structure of the reduced data frame
str(df_va)
## 'data.frame': 2064 obs. of 146 variables:
## $ site : chr "AP" "Dar" "UP" "Pemba" ...
## $ gs_text34: chr "Bite of Venomous Animal" "Malaria" "Measles" "Pneumonia" ...
## $ g4_06 : int 4 4 9 4 5 13 2 4 3 2 ...
## $ g4_07 : int 2 3 3 2 7 1 2 2 2 1 ...
## $ g4_08 : chr "No" "Yes" "Yes" "Yes" ...
## $ g5_01d : chr "20" "Don't Know" "6" "Don't Know" ...
## $ g5_01m : chr "March" "Don't Know" "May" "Don't Know" ...
## $ g5_01y : chr "2005" "2002" "2004" "Don't Know" ...
## $ g5_03d : chr "10" "7" "4" "19" ...
## $ g5_03m : chr "August" "April" "December" "March" ...
## $ g5_03y : int 2009 2009 2009 2009 2007 2009 2009 2009 2009 2009 ...
## $ g5_04a : int 4 7 5 1 6 NA NA NA NA 7 ...
## $ g5_04b : int NA NA NA NA NA 5 10 11 11 NA ...
## $ g5_04c : int NA NA NA NA NA NA NA NA NA NA ...
## $ c1_01 : chr "Multiple" "Singleton" "Singleton" "Singleton" ...
## $ c1_02 : chr "Second" "Don't Know" "Don't Know" "Don't Know" ...
## $ c1_03 : chr "" "Yes" "" "Yes" ...
## $ c1_04 : chr "Don't Know" "Don't Know" "Don't Know" "Don't Know" ...
## $ c1_05 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ c1_06a : chr "Hospital" "Hospital" "Home" "Home" ...
## $ c1_07 : chr "About average" "About average" "About average" "Very small" ...
## $ c1_08a : chr "" "Don't Know" "Grams" "Don't Know" ...
## $ c1_08b : num 3000 NA 3500 NA NA ...
## $ c1_09 : chr "Female" "Female" "Female" "Male" ...
## $ c1_10 : chr "" "Specified" "Specified" "Don't Know" ...
## $ c1_10d : chr "20" "Don't Know" "6" "" ...
## $ c1_10m : chr "March" "Don't Know" "May" "" ...
## $ c1_10y : chr "2005" "2002" "2004" "" ...
## $ c1_11 : chr "Alive" "Alive" "Alive" "Alive" ...
## $ c1_12 : chr "Yes" "Yes" "Yes" "Yes" ...
## $ c1_13 : chr "Yes" "Yes" "Yes" "Yes" ...
## $ c1_14 : chr "Yes" "Yes" "Yes" "Yes" ...
## $ c1_15 : chr "No" "No" "No" "No" ...
## $ c1_16 : chr "No" "No" "No" "No" ...
## $ c1_17 : chr "No" "No" "No" "No" ...
## $ c1_18 : chr "No" "No" "No" "No" ...
## $ c1_19_1 : chr "No" "No" "No" "No" ...
## $ c1_19_2 : chr "No" "No" "No" "No" ...
## $ c1_19_3 : chr "No" "No" "No" "No" ...
## $ c1_19_4a : chr "No" "No" "No" "No" ...
## $ c1_19_4b : logi NA NA NA NA NA NA ...
## $ c1_19_5 : chr "No" "No" "No" "No" ...
## $ c1_19_6 : chr "No" "No" "No" "No" ...
## $ c1_20 : int 0 0 1825 0 0 150 300 300 270 0 ...
## $ c1_21 : int 0 30 3 8 30 7 3 30 60 0 ...
## $ c1_22a : chr "Hospital" "Hospital" "Home" "Hospital" ...
## $ c1_24 : chr "Specified" "Specified" "Specified" "Specified" ...
## $ c1_24d : chr "10" "7" "4" "19" ...
## $ c1_24m : chr "August" "April" "December" "March" ...
## $ c1_24y : chr "2009" "2009" "2009" "2009" ...
## $ c1_25 : int 0 0 1825 0 0 150 300 330 330 0 ...
## $ c1_26 : chr "28 days to 11 years" "28 days to 11 years" "28 days to 11 years" "28 days to 11 years" ...
## $ c4_01 : chr "No" "Yes" "Yes" "Yes" ...
## $ c4_02 : int 0 30 3 4 12 7 2 20 20 0 ...
## $ c4_03 : chr "No" "Yes" "Yes" "Yes" ...
## $ c4_04 : chr "Don't Know" "Severe" "Severe" "Severe" ...
## $ c4_05 : chr "Don't Know" "Continuous" "Continuous" "Continuous" ...
## $ c4_06 : chr "No" "Yes" "Yes" "No" ...
## $ c4_07a : chr "Don't Know" "Specified" "Specified" "Don't Know" ...
## $ c4_07b : int 0 4 4 0 0 3 3 10 4 0 ...
## $ c4_08 : int 0 14 0 0 0 6 1 15 7 0 ...
## $ c4_09 : chr "No" "Yes" "No" "No" ...
## $ c4_10 : int 0 0 0 0 0 5 0 15 7 0 ...
## $ c4_11 : chr "No" "No" "No" "No" ...
## $ c4_12 : chr "No" "No" "No" "No" ...
## $ c4_13 : int 0 0 0 0 0 1 0 0 0 0 ...
## $ c4_14 : chr "No" "No" "No" "No" ...
## $ c4_15 : chr "No" "No" "No" "No" ...
## $ c4_16 : chr "Yes" "No" "No" "Yes" ...
## $ c4_17 : int 0 0 0 8 0 7 0 1 4 0 ...
## $ c4_18 : chr "Yes" "No" "No" "Yes" ...
## $ c4_19 : int 0 0 0 3 0 0 0 0 1 0 ...
## $ c4_20 : chr "No" "No" "No" "Yes" ...
## $ c4_22 : chr "No" "No" "No" "No" ...
## $ c4_23 : chr "No" "No" "No" "No" ...
## $ c4_24 : chr "No" "No" "No" "Yes" ...
## $ c4_25 : chr "No" "No" "No" "No" ...
## $ c4_26 : chr "Yes" "No" "No" "No" ...
## $ c4_27 : chr "<6 hours" "Don't Know" "Don't Know" "Don't Know" ...
## $ c4_28 : chr "No" "No" "No" "Yes" ...
## $ c4_29 : chr "No" "No" "No" "Yes" ...
## $ c4_30 : chr "No" "No" "No" "No" ...
## $ c4_31_1 : chr "Don't Know" "Don't Know" "Don't Know" "Don't Know" ...
## $ c4_31_2 : chr "Don't Know" "Don't Know" "Don't Know" "Don't Know" ...
## $ c4_32 : chr "Don't Know" "Don't Know" "Don't Know" "Don't Know" ...
## $ c4_33 : int 0 0 0 0 0 0 0 1 0 0 ...
## $ c4_34 : chr "No" "No" "No" "No" ...
## $ c4_35 : chr "No" "No" "No" "Yes" ...
## $ c4_36 : chr "Yes" "No" "No" "No" ...
## $ c4_37 : int 0 0 0 0 0 0 0 0 3 0 ...
## $ c4_38 : chr "No" "No" "No" "No" ...
## $ c4_39 : chr "No" "No" "No" "No" ...
## $ c4_40 : chr "Yes" "No" "No" "No" ...
## $ c4_41 : chr "No" "No" "No" "No" ...
## $ c4_42 : chr "No" "No" "No" "No" ...
## $ c4_43 : chr "No" "No" "No" "No" ...
## $ c4_44 : chr "No" "No" "No" "No" ...
## $ c4_45 : chr "" "" "" "" ...
## $ c4_46 : chr "No" "No" "No" "No" ...
## [list output truncated]
Transform data types: convert the character columns into categorical variables (factors).
# Turn every character column into a factor; all other columns pass through
# unchanged. Assigning into df_va[] keeps the data frame's names and row names.
df_va[] <- lapply(df_va, function(column) {
  if (is.character(column)) as.factor(column) else column
})
Compute the frequency distribution of every categorical variable.
# Print a frequency table for every factor column of df_va.
# seq_along() replaces 1:ncol(), which would iterate over c(1, 0) if the data
# frame had no columns. The result is stored in `freq_table` so that the name
# of the base function table() is not reused for a variable.
for (i in seq_along(df_va)) {
  if (is.factor(df_va[[i]])) {
    freq_table <- data.frame(table(df_va[[i]]))
    # Label the first column with the variable name instead of "Var1"
    colnames(freq_table) <- c(names(df_va)[i], "Freq")
    print(freq_table)
  }
}
## site Freq
## 1 AP 449
## 2 Bohol 262
## 3 Dar 467
## 4 Mexico 126
## 5 Pemba 261
## 6 UP 499
## gs_text34 Freq
## 1 AIDS 20
## 2 Bite of Venomous Animal 54
## 3 Diarrhea/Dysentery 256
## 4 Drowning 83
## 5 Encephalitis 41
## 6 Falls 49
## 7 Fires 68
## 8 Hemorrhagic fever 51
## 9 Malaria 116
## 10 Measles 23
## 11 Meningitis 58
## 12 Other Cancers 28
## 13 Other Cardiovascular Diseases 76
## 14 Other Defined Causes of Child Deaths 194
## 15 Other Digestive Diseases 48
## 16 Other Infectious Diseases 67
## 17 Pneumonia 532
## 18 Poisonings 18
## 19 Road Traffic 92
## 20 Sepsis 138
## 21 Violent Death 52
## g4_08 Freq
## 1 2
## 2 No 851
## 3 Yes 1211
## g5_01d Freq
## 1 182
## 2 1 57
## 3 10 82
## 4 11 55
## 5 12 64
## 6 13 37
## 7 14 40
## 8 15 47
## 9 16 50
## 10 17 62
## 11 18 54
## 12 19 51
## 13 2 57
## 14 20 49
## 15 21 34
## 16 22 58
## 17 23 40
## 18 24 42
## 19 25 37
## 20 26 49
## 21 27 50
## 22 28 46
## 23 29 43
## 24 3 57
## 25 30 42
## 26 31 30
## 27 4 58
## 28 5 57
## 29 6 47
## 30 7 53
## 31 8 59
## 32 9 61
## 33 Don't Know 314
## g5_01m Freq
## 1 182
## 2 November 89
## 3 April 156
## 4 August 161
## 5 December 114
## 6 Don't Know 259
## 7 February 139
## 8 January 123
## 9 July 142
## 10 June 146
## 11 March 154
## 12 May 133
## 13 October 142
## 14 September 124
## g5_01y Freq
## 1 182
## 2 1995 1
## 3 1996 3
## 4 1997 19
## 5 1998 71
## 6 1999 61
## 7 2000 59
## 8 2001 60
## 9 2002 60
## 10 2003 70
## 11 2004 99
## 12 2005 93
## 13 2006 124
## 14 2007 223
## 15 2008 430
## 16 2009 345
## 17 2010 14
## 18 Don't Know 150
## g5_03d Freq
## 1 1
## 2 1 50
## 3 10 59
## 4 11 74
## 5 12 72
## 6 13 56
## 7 14 82
## 8 15 70
## 9 16 91
## 10 17 76
## 11 18 52
## 12 19 63
## 13 2 77
## 14 20 67
## 15 21 72
## 16 22 69
## 17 23 76
## 18 24 62
## 19 25 59
## 20 26 73
## 21 27 53
## 22 28 76
## 23 29 47
## 24 3 62
## 25 30 65
## 26 31 32
## 27 4 64
## 28 5 77
## 29 6 75
## 30 7 65
## 31 8 71
## 32 9 70
## 33 Don't Know 6
## g5_03m Freq
## 1 November 197
## 2 April 170
## 3 August 209
## 4 December 176
## 5 Don't Know 2
## 6 February 148
## 7 January 143
## 8 July 159
## 9 June 127
## 10 March 190
## 11 May 153
## 12 October 199
## 13 September 191
## c1_01 Freq
## 1 2
## 2 Don't Know 1
## 3 Multiple 76
## 4 Singleton 1985
## c1_02 Freq
## 1 Don't Know 1990
## 2 First 30
## 3 Second 43
## 4 Third or More 1
## c1_03 Freq
## 1 946
## 2 No 70
## 3 Yes 1048
## c1_04 Freq
## 1 After 65
## 2 Don't Know 1997
## 3 During 2
## c1_06a Freq
## 1 Don't Know 15
## 2 Home 740
## 3 Hospital 1213
## 4 On Route to Health Facility 18
## 5 Other 7
## 6 Other Health Facility 71
## c1_07 Freq
## 1 4
## 2 About average 1661
## 3 Don't Know 42
## 4 larger than usual 117
## 5 smaller than usual 144
## 6 Very small 96
## c1_08a Freq
## 1 374
## 2 Don't Know 477
## 3 Grams 1207
## 4 Refused to Answer 6
## c1_09 Freq
## 1 16
## 2 Don't Know 3
## 3 Female 1216
## 4 Male 828
## 5 Refused to Answer 1
## c1_10 Freq
## 1 399
## 2 Don't Know 313
## 3 Refused to Answer 1
## 4 Specified 1351
## c1_10d Freq
## 1 325
## 2 1 59
## 3 10 80
## 4 11 58
## 5 12 62
## 6 13 36
## 7 14 41
## 8 15 47
## 9 16 51
## 10 17 63
## 11 18 55
## 12 19 53
## 13 2 57
## 14 20 50
## 15 21 34
## 16 22 58
## 17 23 39
## 18 24 42
## 19 25 39
## 20 26 47
## 21 27 51
## 22 28 46
## 23 29 43
## 24 3 59
## 25 30 41
## 26 31 31
## 27 4 58
## 28 5 60
## 29 6 46
## 30 7 54
## 31 8 58
## 32 9 63
## 33 Don't Know 158
## c1_10m Freq
## 1 325
## 2 November 94
## 3 April 159
## 4 August 161
## 5 December 114
## 6 Don't Know 104
## 7 February 138
## 8 January 127
## 9 July 141
## 10 June 147
## 11 March 156
## 12 May 132
## 13 October 139
## 14 September 127
## c1_10y Freq
## 1 325
## 2 1995 1
## 3 1996 3
## 4 1997 19
## 5 1998 69
## 6 1999 59
## 7 2000 59
## 8 2001 60
## 9 2002 63
## 10 2003 70
## 11 2004 99
## 12 2005 87
## 13 2006 121
## 14 2007 215
## 15 2008 432
## 16 2009 362
## 17 2010 16
## 18 Don't Know 4
## c1_11 Freq
## 1 125
## 2 Alive 1936
## 3 Dead 3
## c1_12 Freq
## 1 125
## 2 Don't Know 18
## 3 No 22
## 4 Yes 1899
## c1_13 Freq
## 1 125
## 2 Don't Know 15
## 3 No 3
## 4 Refused to Answer 1
## 5 Yes 1920
## c1_14 Freq
## 1 126
## 2 Don't Know 7
## 3 No 2
## 4 Yes 1929
## c1_15 Freq
## 1 4
## 2 No 2060
## c1_16 Freq
## 1 No 2064
## c1_17 Freq
## 1 No 2064
## c1_18 Freq
## 1 No 2064
## c1_19_1 Freq
## 1 No 2064
## c1_19_2 Freq
## 1 No 2064
## c1_19_3 Freq
## 1 No 2064
## c1_19_4a Freq
## 1 No 2064
## c1_19_5 Freq
## 1 No 2064
## c1_19_6 Freq
## 1 No 2064
## c1_22a Freq
## 1 Don't Know 6
## 2 Home 137
## 3 Hospital 1708
## 4 On Route to Health Facility 65
## 5 Other 141
## 6 Other Health Facility 7
## c1_24 Freq
## 1 Don't Know 1
## 2 Specified 2063
## c1_24d Freq
## 1 1 48
## 2 10 54
## 3 11 70
## 4 12 70
## 5 13 53
## 6 14 75
## 7 15 65
## 8 16 86
## 9 17 73
## 10 18 47
## 11 19 58
## 12 2 74
## 13 20 62
## 14 21 67
## 15 22 64
## 16 23 70
## 17 24 55
## 18 25 55
## 19 26 71
## 20 27 49
## 21 28 73
## 22 29 44
## 23 3 56
## 24 30 57
## 25 31 30
## 26 4 60
## 27 5 77
## 28 6 71
## 29 7 62
## 30 8 69
## 31 9 65
## 32 Don't Know 134
## c1_24m Freq
## 1 November 189
## 2 April 156
## 3 August 201
## 4 December 161
## 5 Don't Know 129
## 6 February 144
## 7 January 135
## 8 July 153
## 9 June 114
## 10 March 183
## 11 May 131
## 12 October 188
## 13 September 180
## c1_24y Freq
## 1 1999 1
## 2 2001 1
## 3 2005 1
## 4 2007 42
## 5 2008 371
## 6 2009 1340
## 7 2010 180
## 8 Don't Know 128
## c1_26 Freq
## 1 28 days to 11 years 2064
## c4_01 Freq
## 1 Don't Know 2
## 2 No 770
## 3 Yes 1292
## c4_03 Freq
## 1 Don't Know 9
## 2 No 1053
## 3 Yes 1002
## c4_04 Freq
## 1 Don't Know 1067
## 2 Mild 50
## 3 Moderate 314
## 4 Severe 633
## c4_05 Freq
## 1 Continuous 584
## 2 Don't Know 1068
## 3 On and Off 388
## 4 Only at Night 24
## c4_06 Freq
## 1 Don't Know 15
## 2 No 1404
## 3 Yes 645
## c4_07a Freq
## 1 Don't Know 1446
## 2 Specified 618
## c4_09 Freq
## 1 Don't Know 3
## 2 No 1799
## 3 Yes 262
## c4_11 Freq
## 1 Don't Know 3
## 2 No 1997
## 3 Refused to Answer 1
## 4 Yes 63
## c4_12 Freq
## 1 No 1504
## 2 Refused to Answer 3
## 3 Yes 557
## c4_14 Freq
## 1 No 1826
## 2 Yes 238
## c4_15 Freq
## 1 Don't Know 2
## 2 No 1874
## 3 Yes 188
## c4_16 Freq
## 1 Don't Know 9
## 2 No 708
## 3 Yes 1347
## c4_18 Freq
## 1 Don't Know 35
## 2 No 1177
## 3 Refused to Answer 1
## 4 Yes 851
## c4_20 Freq
## 1 Don't Know 25
## 2 No 1406
## 3 Yes 633
## c4_22 Freq
## 1 Don't Know 27
## 2 No 1701
## 3 Yes 336
## c4_23 Freq
## 1 Don't Know 28
## 2 No 1597
## 3 Yes 439
## c4_24 Freq
## 1 Don't Know 25
## 2 No 1761
## 3 Refused to Answer 2
## 4 Yes 276
## c4_25 Freq
## 1 Don't Know 5
## 2 No 1489
## 3 Yes 570
## c4_26 Freq
## 1 Don't Know 25
## 2 No 1398
## 3 Yes 641
## c4_27 Freq
## 1 <6 hours 291
## 2 24 hours or more 201
## 3 6-23 hours 140
## 4 Don't Know 1432
## c4_28 Freq
## 1 Don't Know 18
## 2 No 1841
## 3 Yes 205
## c4_29 Freq
## 1 Don't Know 26
## 2 No 1956
## 3 Yes 82
## c4_30 Freq
## 1 Don't Know 3
## 2 No 1893
## 3 Yes 168
## c4_31_1 Freq
## 1 Don't Know 1902
## 2 Everywhere 85
## 3 Extremities 26
## 4 Face 18
## 5 Other 1
## 6 Trunk 32
## c4_31_2 Freq
## 1 Don't Know 2059
## 2 Extremities 4
## 3 Trunk 1
## c4_32 Freq
## 1 Don't Know 1912
## 2 Everywhere 21
## 3 Extremities 41
## 4 Face 46
## 5 Other 1
## 6 Trunk 43
## c4_34 Freq
## 1 Don't Know 2
## 2 No 2028
## 3 Yes 34
## c4_35 Freq
## 1 Don't Know 4
## 2 No 1803
## 3 Yes 257
## c4_36 Freq
## 1 Don't Know 2
## 2 No 1781
## 3 Yes 281
## c4_38 Freq
## 1 Don't Know 6
## 2 No 1938
## 3 Refused to Answer 2
## 4 Yes 118
## c4_39 Freq
## 1 Don't Know 2
## 2 No 1991
## 3 Yes 71
## c4_40 Freq
## 1 Don't Know 4
## 2 No 1642
## 3 Refused to Answer 1
## 4 Yes 417
## c4_41 Freq
## 1 Don't Know 22
## 2 No 1273
## 3 Refused to Answer 1
## 4 Yes 768
## c4_42 Freq
## 1 Don't Know 20
## 2 No 2009
## 3 Yes 35
## c4_43 Freq
## 1 Don't Know 41
## 2 No 1847
## 3 Yes 176
## c4_44 Freq
## 1 Don't Know 3
## 2 No 1770
## 3 Yes 291
## c4_45
## 1
## 2 2
## 3 Accident ke bad nak kan i mooh se blood baha tha
## 4 adha gala kata, cenay per
## 5 after accident blooding from mouth and nose.
## 6 all body
## 7 ANUS, NOSE & MOUTH
## 8 AT THE SIDE KAY GIBUTANGAN UG HOSE PARA MAKUHA ANG TUBIG
## 9 Bache ke honth se raktsrav hua tha.
## 10 BACK SIDE HEAD
## 11 BLED FROM MOUTH AND NOSE
## 12 BLED FROM NOSE.
## 13 BLEED FROM NOSE
## 14 BLEED FROM URINE.
## 15 BLEED IN HEAD
## 16 BLEED IN MOUTH
## 17 BLEED IN MOUTH AND NOSE.
## 18 BLEED IN NOSE
## 19 BLEED IN NOSE AND MOUTH
## 20 BLEED IN URINE
## 21 BLEED VOMITNG
## 22 BLEED VOMITNGS.
## 23 bleeding from head
## 24 Bleeding from nose after pushing chest
## 25 Bleeding from where he peed (penis)
## 26 BLEEDING IN MOUTH AND IN HEAD
## 27 Bleeding in potty
## 28 blodding from head cuse of force on head
## 29 blood from head after accident
## 30 BLOOD IN THE URINE
## 31 Blood Vomitting
## 32 BLOOD WOMTINGS
## 33 Blooding from head after accident cause of deep injury in head
## 34 Blooding from nose
## 35 Blooding with urine
## 36 BODY
## 37 BY THE MOUTH.
## 38 cause of attract with scissor blooding from stomach. Its blooding is enough.
## 39 CHEST, LEGS AND HANDS
## 40 CORD BLEEDING
## 41 Dabne ke karan jaangh par lohe ki keel se khoon nikla tha
## 42 Death se kuch ghante pahle nak se
## 43 death se pahle muh se khoon aa gaya tha.
## 44 Deevar girnay kay karar naak say ractstrava howa tha.
## 45 Deevar may dabnay kay karagh kaan va Naak say Ract estrava howa tha
## 46 EAR
## 47 EAR, HEAD.
## 48 ears
## 49 EARS, NOSE, MOUTH
## 50 EXCRETED BLOODY STOOL (ANUS)
## 51 EYER, MOUTH
## 52 FACE, CHEST
## 53 face, Neck
## 54 Face, Sar
## 55 FACE, STUMACK
## 56 FOREHEAD
## 57 foreskin
## 58 Form back side of head
## 59 From ear and throat
## 60 From Head
## 61 FROM HEAD
## 62 From head (Sar ke bal banane ke bad)
## 63 From Head and Anus
## 64 From head and dhadh
## 65 From head and hand
## 66 From Head, nose, mouth
## 67 FROM HER MOUTH
## 68 FROM HER OROPHARENGEAL TUBE, BLOOD CAME OUT
## 69 FROM HIS TUBE (NGT)
## 70 from left arm
## 71 from mouth
## 72 From mouth
## 73 From Mouth
## 74 FROM MOUTH
## 75 From mouth and nose
## 76 FROM NAILS
## 77 From Nake
## 78 from nose
## 79 from Nose
## 80 From Nose
## 81 From Nose (At the time of putting pipe in nose)
## 82 From Nose , Ear & Mouth
## 83 from nose and mouth
## 84 from Nose because of nose pipe
## 85 From nose, ear, mouth
## 86 From stomach
## 87 from temple
## 88 from the mouth
## 89 From the mouth because of a lack of ability of the blood to clot (because he/she had a probe/catheter)
## 90 Gala say
## 91 Galay say, pate say
## 92 GUMS
## 93 HAND
## 94 HANDS AND LEGS
## 95 Hath par sap ke katne se
## 96 HEAD
## 97 HEAD - VERY LITTLE AMOUNT
## 98 HEAD (FROM HIS WOUND)
## 99 Head ke pichle hisse se ,nak mooh se
## 100 HEAD RIGHT SIDE(KANITHA)
## 101 HEAD, NOSE, EAR
## 102 heart, head
## 103 IN HER MOUTHE WHEN SHE WAS SUCTIONED.
## 104 Kaan,naak Kanpati say chaku marnay kay karar Ractstrava howa tha
## 105 KICHWANI
## 106 KWENYE CHOO KIKUBWA
## 107 KWENYE ULIMI
## 108 Left arm, nose, right foot, mouth
## 109 LEFT EAR
## 110 LEFT HAND
## 111 LEG
## 112 LIPS
## 113 MalMutra
## 114 masoodhe se
## 115 MDOMONI
## 116 MGUUNI NA MKONONI
## 117 Mounth & Latrin
## 118 mouth
## 119 Mouth
## 120 MOUTH
## 121 Mouth and Nose
## 122 MOUTH AND NOSE
## 123 MOUTH CUT NOSE
## 124 MOUTH, ANUS, NOSE
## 125 Mouth, Neck
## 126 Mouth, Neck, ear , up the leg , Ghutno saya
## 127 MOUTH, NOSE
## 128 MOUTH, NOSE, EAR.
## 129 MOUTH,NOCE, EAR
## 130 MOUTH; NOSE
## 131 naak say khun
## 132 NECK
## 133 NOCE, MOUTH,ANUS
## 134 NOLE
## 135 nose
## 136 Nose
## 137 NOSE
## 138 NOSE - GAMAY RA DAW
## 139 NOSE & MOUTH
## 140 Nose and mouth
## 141 Nose and Mouth
## 142 NOSE AND MOUTH
## 143 NOSE MOUTH
## 144 NOSE WHEN SUCTIONED
## 145 Nose, ear
## 146 NOSE, EAR, MOUTH
## 147 NOSE, MOUTH
## 148 NOSE, MOUTH FROM STAMAC BLOOD WAS BLEEDING.
## 149 NOSE, MOUTH.
## 150 on head
## 151 On Head
## 152 ONE LAST DAY URINATED BLOOD FROM PENIS (BLOOD URINE)
## 153 only neck say Balgum kay Sath
## 154 paav kay Anguthay say
## 155 PAAV may
## 156 PUANI
## 157 PUANI MDOMONI KUHARA DAMU
## 158 Pure sarir may
## 159 rectum which had a fisure
## 160 RIGHT LEG
## 161 RIGHT UPPER EXTERMETY, BOTH LOWER EXTERMETY (DECEASED PHOTO ATTACHED)
## 162 Saans band hone par nak se raktshrav hua tha tab pamp dvara saans ke lene me sahayta ki gai
## 163 Saap kay katanay kay baad naak say ractstrava howa tha
## 164 Sanp katne ke bad nak se halka raktshrav hua tha
## 165 Sanp ne dahine pair ki ungli me kata tha katne ke paschat ungli se raktshrav hua tha
## 166 Sar , Jaghan
## 167 sar say
## 168 Sar say Sinay Pat say,
## 169 SEHEMU YA HAJA KUBWA
## 170 Seir
## 171 Seir may chote laganay say RactStrava howa tha
## 172 Seir say , Kamer kay nechey
## 173 seirsay, sinaysay bahay say, jangh say pate per
## 174 sharir kay vibhin Hisso say (wankay say Hatya)
## 175 SOMEWHERE IN THE STOMACH
## 176 STOMACH
## 177 stomach where the operation was
## 178 Tanduya kay hamlay kay baad sar , galy say ractstrav huwa tha.
## 179 Tatti ke sath
## 180 TUMBONI
## 181 Ulti
## 182 URINE
## 183 URINE WITH BLOOD (FRESH)
## 184 VAGINAL
## 185 VEGINAL BLEEDING ( BECAUSE OF HURT IN THE LOWER ABDOMEN.
## 186 Visfoot kay uprant poray sarir say ractstrava howa tha
## 187 VOMITS OUT BLOOD (MOUTH)
## 188 with Black Potty
## Freq
## 1 1777
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
## 7 1
## 8 1
## 9 1
## 10 1
## 11 1
## 12 1
## 13 2
## 14 1
## 15 1
## 16 5
## 17 1
## 18 2
## 19 1
## 20 1
## 21 1
## 22 1
## 23 1
## 24 1
## 25 1
## 26 1
## 27 1
## 28 1
## 29 1
## 30 1
## 31 1
## 32 1
## 33 1
## 34 2
## 35 1
## 36 1
## 37 1
## 38 1
## 39 1
## 40 1
## 41 1
## 42 1
## 43 1
## 44 1
## 45 1
## 46 1
## 47 1
## 48 1
## 49 1
## 50 1
## 51 1
## 52 1
## 53 1
## 54 1
## 55 1
## 56 1
## 57 1
## 58 1
## 59 1
## 60 2
## 61 2
## 62 1
## 63 1
## 64 1
## 65 1
## 66 1
## 67 1
## 68 1
## 69 1
## 70 1
## 71 3
## 72 2
## 73 2
## 74 1
## 75 2
## 76 1
## 77 2
## 78 3
## 79 2
## 80 2
## 81 1
## 82 1
## 83 2
## 84 1
## 85 1
## 86 2
## 87 1
## 88 1
## 89 1
## 90 1
## 91 1
## 92 1
## 93 1
## 94 1
## 95 1
## 96 11
## 97 1
## 98 1
## 99 1
## 100 1
## 101 1
## 102 1
## 103 1
## 104 1
## 105 1
## 106 1
## 107 1
## 108 1
## 109 1
## 110 1
## 111 1
## 112 1
## 113 1
## 114 1
## 115 7
## 116 1
## 117 1
## 118 3
## 119 3
## 120 24
## 121 2
## 122 2
## 123 1
## 124 1
## 125 2
## 126 1
## 127 2
## 128 1
## 129 1
## 130 1
## 131 1
## 132 1
## 133 1
## 134 1
## 135 2
## 136 4
## 137 18
## 138 1
## 139 1
## 140 1
## 141 1
## 142 7
## 143 1
## 144 1
## 145 1
## 146 1
## 147 3
## 148 1
## 149 1
## 150 1
## 151 1
## 152 1
## 153 1
## 154 1
## 155 1
## 156 1
## 157 1
## 158 1
## 159 1
## 160 1
## 161 1
## 162 1
## 163 1
## 164 1
## 165 1
## 166 1
## 167 1
## 168 1
## 169 1
## 170 2
## 171 1
## 172 1
## 173 1
## 174 1
## 175 1
## 176 1
## 177 1
## 178 1
## 179 1
## 180 1
## 181 1
## 182 3
## 183 1
## 184 1
## 185 1
## 186 1
## 187 1
## 188 1
## c4_46 Freq
## 1 Don't Know 4
## 2 No 1879
## 3 Refused to Answer 1
## 4 Yes 180
## c4_47_1 Freq
## 1 No 1972
## 2 Yes 92
## c4_47_2 Freq
## 1 No 1985
## 2 Yes 79
## c4_47_3 Freq
## 1 No 1985
## 2 Yes 79
## c4_47_4 Freq
## 1 No 2045
## 2 Yes 19
## c4_47_5 Freq
## 1 No 2004
## 2 Yes 60
## c4_47_6 Freq
## 1 No 2007
## 2 Yes 57
## c4_47_7 Freq
## 1 No 2010
## 2 Yes 54
## c4_47_8a Freq
## 1 No 1996
## 2 Yes 68
## c4_47_8b Freq
## 1 1990
## 2 ACCEDENTAL ELECTRIC SHOCK 1
## 3 ACCIDENTAL ELECTRICAL BURN 1
## 4 ACCIDENTAL ELECTRICS SHOCK 1
## 5 BLOCK FALLS 1
## 6 BOIL WATER 1
## 7 BOIL WATER FALL DOWN ON BODY 1
## 8 BOMB BLAST 2
## 9 Bum Blast 1
## 10 BURN FROM HOT WATER 1
## 11 BURNED BY HOT WATER 1
## 12 Chakkar Khakar Gir Gaya Tha 1
## 13 Chapper Girne Se 1
## 14 CURENT SHOCK 1
## 15 CURRENT SHOCK 2
## 16 DAL FALL DOWN ON THE BODY 1
## 17 Dale Me Dab Gaya Tha. 1
## 18 Death From Fall Wall 1
## 19 Deevar Gornay Say Dubkar Mrathu 1
## 20 Deevar Kay Nechay Dabnay Say Mratthu 1
## 21 Deewar Se Niche Dabkar 1
## 22 Divar Girne Par Usme Dabkar. 1
## 23 Diwar Me Dabkar 1
## 24 DOG'S BITE. 1
## 25 ELECTIRC SHOCK 1
## 26 ELECTRIC SHOCK 3
## 27 ELECTRICAL SHOCK 2
## 28 FALL DOWN STONE 1
## 29 FALL DOWN WALL 1
## 30 FALLEN OF WALL ON BUDY 1
## 31 Falling Wall 2
## 32 FELL DOWN GATE 1
## 33 GATE FELL DOWN ON CHILD BODY 1
## 34 Ghar Girne Se Death Hui 1
## 35 HAPANA 2
## 36 HAPANA 1
## 37 HEAD HIT BY A FALLING TREE 1
## 38 HEAD INJURY 3
## 39 HIT BY A COCONUT 1
## 40 HOT WATER 1
## 41 HOT WATER BURNS 1
## 42 HOT WATER FALLON ON BABY 1
## 43 INJURY TO GROIN 1
## 44 KAKANYANGA KICHOMVI 1
## 45 Karant Laga 1
## 46 KUANGUKIWA NA MNAZI 1
## 47 LAND FALLS 1
## 48 LEG INJURY 1
## 49 Mitti Se Niche Dabkar 3
## 50 PRICK OF NAIL 1
## 51 ROAD TRAFFIC ACCIDENT 1
## 52 Self Shouting Mistake 1
## 53 SINKING IN THE TOILET 1
## 54 SPILLING OF HOT WATER 1
## 55 SWALOD IRON TABLETS 1
## 56 SWELLINGS 1
## 57 T.V FELLED ON FACE 1
## 58 T.V.FALL DOWN ON HEAD 1
## 59 T.V.FALL DOWN ON THE HEAD 1
## 60 TREE BRANCH FELL ON HIM 1
## 61 Tv Fall On Head 1
## 62 Under Wall 2
## 63 UVIMBE WA SINDANO 1
## c4_47_9 Freq
## 1 No 2040
## 2 Yes 24
## c4_47_10 Freq
## 1 No 2035
## 2 Yes 29
## c4_47_11 Freq
## 1 No 1511
## 2 Yes 553
## c4_48 Freq
## 1 Don't Know 51
## 2 No 1938
## 3 Yes 75
## c5_01 Freq
## 1 No 281
## 2 Yes 1783
## c5_02_1 Freq
## 1 No 2009
## 2 Yes 55
## c5_02_2 Freq
## 1 No 2001
## 2 Yes 63
## c5_02_3 Freq
## 1 No 2055
## 2 Yes 9
## c5_02_4 Freq
## 1 No 347
## 2 Yes 1717
## c5_02_5 Freq
## 1 No 1791
## 2 Yes 273
## c5_02_6 Freq
## 1 No 1864
## 2 Yes 200
## c5_02_7 Freq
## 1 No 2032
## 2 Yes 32
## c5_02_8 Freq
## 1 No 2061
## 2 Yes 3
## c5_02_9 Freq
## 1 No 1582
## 2 Yes 482
## c5_02_10 Freq
## 1 No 2043
## 2 Yes 21
## c5_02_11a Freq
## 1 No 2046
## 2 Yes 18
## c5_02_12 Freq
## 1 No 2063
## 2 Yes 1
## c5_02_13 Freq
## 1 No 2064
## c5_02_14 Freq
## 1 No 2064
## c5_04 Freq
## 1 282
## 2 Don't Know 4
## 3 No 1544
## 4 Refused to Answer 1
## 5 Yes 233
## c5_05 Freq
## 1 No 1972
## 2 Yes 92
## c5_06_1d Freq
## 1 1 2
## 2 10 4
## 3 11 4
## 4 12 2
## 5 13 3
## 6 14 4
## 7 15 3
## 8 16 2
## 9 17 1
## 10 18 2
## 11 19 4
## 12 2 3
## 13 20 4
## 14 21 1
## 15 22 2
## 16 23 3
## 17 24 3
## 18 25 1
## 19 26 5
## 20 27 4
## 21 28 3
## 22 29 2
## 23 3 2
## 24 31 1
## 25 4 4
## 26 5 2
## 27 6 3
## 28 7 3
## 29 8 7
## 30 9 4
## 31 Don't Know 1976
## c5_06_1m Freq
## 1 November 11
## 2 April 4
## 3 August 7
## 4 December 4
## 5 Don't Know 1976
## 6 February 11
## 7 January 6
## 8 July 11
## 9 June 2
## 10 March 6
## 11 May 5
## 12 October 8
## 13 September 13
## c5_06_1y Freq
## 1 2006 1
## 2 2007 5
## 3 2008 14
## 4 2009 58
## 5 2010 10
## 6 Don't Know 1976
## c5_06_2d Freq
## 1 10 3
## 2 11 3
## 3 12 2
## 4 14 2
## 5 17 1
## 6 18 1
## 7 20 1
## 8 21 2
## 9 22 1
## 10 23 3
## 11 24 2
## 12 25 2
## 13 26 2
## 14 27 1
## 15 28 6
## 16 3 5
## 17 31 2
## 18 4 2
## 19 5 1
## 20 7 2
## 21 8 2
## 22 Don't Know 2018
## c5_06_2m Freq
## 1 November 6
## 2 April 3
## 3 August 4
## 4 December 4
## 5 Don't Know 2018
## 6 February 4
## 7 January 5
## 8 July 2
## 9 June 3
## 10 March 8
## 11 May 1
## 12 October 4
## 13 September 2
## c5_06_2y Freq
## 1 2007 1
## 2 2008 7
## 3 2009 29
## 4 2010 7
## 5 Don't Know 2020
## c5_08d Freq
## 1 1 1
## 2 10 3
## 3 11 4
## 4 12 2
## 5 13 1
## 6 14 6
## 7 15 3
## 8 17 1
## 9 18 1
## 10 19 3
## 11 2 1
## 12 20 1
## 13 21 2
## 14 22 1
## 15 23 4
## 16 24 3
## 17 25 1
## 18 26 5
## 19 27 1
## 20 28 6
## 21 3 9
## 22 30 1
## 23 31 1
## 24 4 4
## 25 5 3
## 26 7 4
## 27 8 5
## 28 9 3
## 29 Don't Know 1984
## c5_08m Freq
## 1 November 7
## 2 April 5
## 3 August 9
## 4 December 4
## 5 Don't Know 1984
## 6 February 11
## 7 January 5
## 8 July 7
## 9 June 5
## 10 March 8
## 11 May 2
## 12 October 8
## 13 September 9
## c5_08y Freq
## 1 2007 3
## 2 2008 12
## 3 2009 55
## 4 2010 10
## 5 Don't Know 1984
## c5_10 Freq
## 1 108
## 2 Don't Know 45
## 3 No 858
## 4 Yes 1053
## c5_11 Freq
## 1 No 1758
## 2 Yes 306
## c5_17 Freq
## 1 104
## 2 Don't Know 184
## 3 No 942
## 4 Refused to Answer 4
## 5 Yes 830
## c5_18 Freq
## 1 Don't Know 17
## 2 No 2020
## 3 Refused to Answer 2
## 4 Yes 25
## c5_19 Freq
## 1 236
## 2 Don't Know 140
## 3 No 1654
## 4 Refused to Answer 5
## 5 Yes 29
Columns that still need to be removed
module: removed (same value for all observations); it was already dropped in the first column-removal step above
c1_02: remove, as only applies to multiple births (info is covered by c1_01 already)
c1_04: remove, as it only applies to cases where the mother had died (and that information is already covered by c1_03)
c1_05: remove, as it only applies to cases where the mother had died
c1_08a: remove, since it only shows the unit (refer to c1_08b for the actual values)
c1_10, c1_10d, c1_10m, c1_10y: remove, duplicates of g5_01
c1_11: remove, since we only want to focus on children who were not dead at birth
c1_15: remove, since there’s only one class
c1_16: remove, since there’s only one class
c1_17: remove, since there’s only one class
c1_18: remove, since there’s only one class
c1_19_1 to c1_19_6: remove, since there’s only one class
c1_24: remove, since only showing units
c1_24d,c1_24m,c1_24y: remove, duplicated with g5_03
c1_26: remove, since there’s only one class
c4_07a: remove, since only showing units
c4_31_1: remove, since only applies to those with rash (information already there in c4_30)
c4_31_2: remove, as only applies to the ones who developed rash (info already covered in c4_30)
c4_32: remove, as only applies to the ones who developed rash (info already covered in c4_30)
c4_45: remove, since most of the values are missing
c4_47_8b: remove, since most of the values are missing
c5_02_13: remove, since there’s only one class
c5_02_14: remove, since there’s only one class
c5_06_2m, c5_06_2d, c5_06_2y: remove, since most of the data is “Don’t Know” or missing
c5_08m, c5_08d, c5_08y: remove, since most of the data is “Don’t Know” or missing
# Remove the columns listed above (single-class, unit-only, duplicated or
# mostly missing variables).
df_va <- subset(
  df_va,
  select = -c(
    c1_02, c1_04, c1_05, c1_08a,
    c1_10, c1_10d, c1_10m, c1_10y,
    c1_11, c1_15, c1_16, c1_17, c1_18,
    c1_19_1, c1_19_2, c1_19_3, c1_19_4a, c1_19_4b, c1_19_5, c1_19_6,
    c1_24, c1_24d, c1_24m, c1_24y,
    c1_26, c4_07a,
    c4_31_1, c4_31_2, c4_32,
    c4_45, c4_47_8b,
    c5_02_13, c5_02_14,
    c5_06_2m, c5_06_2d, c5_06_2y,
    c5_08m, c5_08d, c5_08y
  )
)
Columns that need further processing
g5_01d,g5_01m,g5_01y: compute DOB and then remove
g5_03d,g5_03m,g5_03y: compute DOD and then remove
# Build a UTC POSIXct date from separate year, month-name and day vectors.
#   year, day  : factor, character or integer vectors holding the date parts
#   month_name : English month names as listed in `month.name`
# Returns a POSIXct vector; any element whose parts cannot be parsed
# (e.g. "Don't Know" or a blank) is NA.
build_date <- function(year, month_name, day) {
  # NOTE(review): in the frequency tables "November" sorts ahead of "April",
  # which suggests the raw label carries leading whitespace -- TODO confirm.
  # Without trimws() such a label never matches month.name and the whole
  # date silently becomes NA.
  month_num <- match(trimws(as.character(month_name)), month.name)
  date_str <- paste(
    trimws(as.character(year)),
    month_num,
    trimws(as.character(day)),
    sep = "-"
  )
  as.POSIXct(date_str, format = "%Y-%m-%d", tz = "UTC")
}

# Compute DOB
df_va$DOB <- build_date(df_va$g5_01y, df_va$g5_01m, df_va$g5_01d)
# Compute DOD
df_va$DOD <- build_date(df_va$g5_03y, df_va$g5_03m, df_va$g5_03d)
# The raw date parts are no longer needed once DOB and DOD exist
df_va <- subset(
  df_va,
  select = -c(g5_01d, g5_01m, g5_01y, g5_03d, g5_03m, g5_03y)
)
c1_22a: combine “Hospital” and “Other Health Facility” into a single level, “Health Facility”
# Merge the two facility categories of c1_22a into one "Health Facility" level.
# The recode is done on the character values and then re-factored, so the
# resulting levels are in alphabetical order.
df_va$c1_22a <- local({
  place <- as.character(df_va$c1_22a)
  place[place %in% c("Hospital", "Other Health Facility")] <- "Health Facility"
  as.factor(place)
})
c5_06_1d,c5_06_1m,c5_06_1y: remove, as most of the records are missing
# Drop the three c5_06_1 date-part columns (day, month, year)
df_va <- subset(
  df_va,
  select = -c(c5_06_1d, c5_06_1m, c5_06_1y)
)
Imputation and removal for missing values
Check missing values for numeric variables
# Keep only the numeric columns and print their summaries; summary() reports
# an NA count for each column that has missing values.
df_va_num <- Filter(is.numeric, df_va)
summary(df_va_num)
## g4_06 g4_07 g5_04a g5_04b
## Min. : 1.000 Min. : 0.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 3.000 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 2.000
## Median : 5.000 Median : 2.000 Median : 4.000 Median : 5.000
## Mean : 5.225 Mean : 2.436 Mean : 4.855 Mean : 5.131
## 3rd Qu.: 6.000 3rd Qu.: 3.000 3rd Qu.: 8.000 3rd Qu.: 8.000
## Max. :30.000 Max. :15.000 Max. :19.000 Max. :12.000
## NA's :1 NA's :1 NA's :792 NA's :1280
## g5_04c c1_08b c1_20 c1_21
## Min. :28.0 Min. : 2 Min. : 0.0 Min. : 0.00
## 1st Qu.:28.0 1st Qu.:2500 1st Qu.: 0.0 1st Qu.: 1.00
## Median :29.0 Median :2800 Median : 90.0 Median : 7.00
## Mean :28.6 Mean :2798 Mean : 564.4 Mean : 37.15
## 3rd Qu.:29.0 3rd Qu.:3100 3rd Qu.: 365.0 3rd Qu.: 20.00
## Max. :29.0 Max. :9999 Max. :4015.0 Max. :4015.00
## NA's :2059 NA's :485
## c1_25 c4_02 c4_07b c4_08
## Min. : 0.0 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.0 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 90.0 Median : 2.000 Median : 0.000 Median : 0.000
## Mean : 559.2 Mean : 5.587 Mean : 1.839 Mean : 1.701
## 3rd Qu.: 365.0 3rd Qu.: 7.000 3rd Qu.: 3.000 3rd Qu.: 0.000
## Max. :4015.0 Max. :240.000 Max. :30.000 Max. :98.000
##
## c4_10 c4_13 c4_17 c4_19
## Min. : 0.0000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.0000 Median : 0.000 Median : 1.000 Median : 0.000
## Mean : 0.3203 Mean : 3.708 Mean : 5.487 Mean : 2.755
## 3rd Qu.: 0.0000 3rd Qu.: 1.000 3rd Qu.: 4.000 3rd Qu.: 1.000
## Max. :30.0000 Max. :300.000 Max. :740.000 Max. :740.000
##
## c4_33 c4_37 c4_49 c5_07_1
## Min. : 0.0000 Min. : 0.0000 Min. : 0.0000 Min. : 0.0
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.0
## Median : 0.0000 Median : 0.0000 Median : 0.0000 Median : 0.0
## Mean : 0.6192 Mean : 0.9525 Mean : 0.8075 Mean : 257.1
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.0
## Max. :98.0000 Max. :98.0000 Max. :150.0000 Max. :12000.0
##
## c5_07_2
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 0.0
## Mean : 235.9
## 3rd Qu.: 0.0
## Max. :13200.0
##
Remove c5_07_1 and c5_07_2, since most of their values are 0 (which means missing in this case)
# Drop both c5_07 columns.
df_va <- subset(
  df_va,
  select = -c(c5_07_1, c5_07_2)
)
c1_08b: re-code “9999” as NA
# 9999 is a placeholder code in c1_08b; turn it into a real missing value.
# which() skips entries that are already NA.
df_va$c1_08b <- replace(df_va$c1_08b, which(df_va$c1_08b == 9999), NA)
Drop g5_04a,g5_04b,g5_04c and use DOB and DOD to compute the age of death (in years)
# Drop the questionnaire age fields and derive the age at death from the
# parsed dates instead.
df_va <- subset(df_va, select = -c(g5_04a, g5_04b, g5_04c))

# Age at death in years: day difference between DOD and DOB divided by 365
# (leap days are not accounted for). Rows with an unparsable DOB or DOD get NA.
df_va$age_death <- as.numeric(difftime(df_va$DOD, df_va$DOB, units = "days")) / 365

library(naniar)
# Plot missingness for the numeric columns of the *current* df_va. The
# df_va_num snapshot was taken before c5_07_* / g5_04* were dropped, before
# 9999 was recoded to NA and before age_death existed, so it would show a
# stale picture.
vis_miss(select_if(df_va, is.numeric))
split training and test set
library(caTools)
# Reproducible 70/30 split; sample.split() is given the site column so the
# split is drawn with respect to site labels.
set.seed(123)
split <- sample.split(df_va$site, SplitRatio = 0.7)
# `split` is a logical vector: TRUE rows go to training, FALSE rows to test.
training_set <- df_va[split, ]
test_set <- df_va[!split, ]
Deal with “Don’t Know”, “Refused to Answer”, blank and originally missing values for all the categorical variables
1) If the combined count of these values in a variable is >=10: re-code all of them as “Don’t Know”, which is kept as a group of its own
2) If the combined count of these values in a variable is <10: re-code all of them as missing values
# Define function impute_cat
#
# Harmonise the "unknown" answers of every factor column of a data frame.
#
# For each factor column the function counts the entries that are
# "Don't Know", "Refused to Answer", an empty string, or NA:
#   * if there are at least `10` of them, they are all re-coded as
#     "Don't Know", which is kept as a level of its own;
#   * otherwise they are all re-coded as NA.
# Non-factor columns are returned unchanged. Factor columns are rebuilt with
# as.factor(), so unused levels are dropped.
#
# Args:
#   df_va: a data frame.
# Returns:
#   The data frame with its factor columns re-coded as described.
impute_cat <- function(df_va) {
  unknown_labels <- c("Don't Know", "Refused to Answer", "")
  factor_cols <- which(vapply(df_va, is.factor, logical(1)))

  for (i in factor_cols) {
    values <- as.character(df_va[[i]])
    is_unknown <- is.na(values) | values %in% unknown_labels

    # Threshold is ">= 10" as documented; the earlier "== 10 || > 15" test
    # wrongly sent counts of 11 to 15 down the NA branch.
    if (sum(is_unknown) >= 10) {
      values[is_unknown] <- "Don't Know"
    } else {
      values[is_unknown] <- NA
    }

    df_va[[i]] <- as.factor(values)
  }

  df_va
}
# apply impute_cat on both training set and test set
# NOTE(review): each set is re-coded from its own counts, so the same variable
# can keep a "Don't Know" level in one set and be turned into NA in the other
# - confirm this is intended.
training_set <- impute_cat(training_set)
test_set <- impute_cat(test_set)
Impute missing values of c1_08b with mean of the data by different site groups (fit on training set and transform both sets)
# Mean of c1_08b per site in the training set, ignoring missing values.
with(training_set, tapply(c1_08b, site, mean, na.rm = TRUE))
## AP Bohol Dar Mexico Pemba UP
## 2710.312 2971.918 2963.937 2554.524 3058.375 2715.539
# Return a value for c1_08b, filling in a site-specific constant when missing.
#
# Args:
#   x1: a single value (may be NA).
#   x2: a single site label.
# Returns:
#   x1 itself when it is not NA; otherwise the hard-coded constant for the
#   site (the rounded training-set site means). Any label other than AP,
#   Bohol, Dar, Mexico or Pemba falls through to 2716.
correct_weight <- function(x1, x2) {
  # Observed values pass through untouched.
  if (!is.na(x1)) {
    return(x1)
  }
  if (x2 == "AP") {
    return(2710)
  }
  if (x2 == "Bohol") {
    return(2972)
  }
  if (x2 == "Dar") {
    return(2964)
  }
  if (x2 == "Mexico") {
    return(2555)
  }
  if (x2 == "Pemba") {
    return(3058)
  }
  2716
}
# Fill missing c1_08b values in both sets with the per-site constants encoded
# in correct_weight().
# mapply() walks the two columns element by element, so the result stays
# numeric. Calling apply() on the two-column data frame instead would coerce
# every row to character (numeric + factor -> character matrix) and leave
# c1_08b as a character column, silently excluding it from all numeric
# summaries and plots.
training_set$c1_08b <- as.numeric(mapply(
  correct_weight,
  training_set$c1_08b,
  as.character(training_set$site),
  USE.NAMES = FALSE
))
test_set$c1_08b <- as.numeric(mapply(
  correct_weight,
  test_set$c1_08b,
  as.character(test_set$site),
  USE.NAMES = FALSE
))
Remove all the remaining missing data for both sets
# Keep only the rows that have no NA in any column, in both sets.
training_set <- training_set[rowSums(is.na(training_set)) == 0, ]
test_set <- test_set[rowSums(is.na(test_set)) == 0, ]
Visualization for EDA
# Change column names for training_set_copy
library(ggpubr)
library(plyr)
training_set_copy <- training_set

# Questionnaire code -> readable column name. Codes that are not present in
# the data are reported by mapvalues() and otherwise ignored; columns that are
# not listed here keep their current name.
readable_names <- c(
  g4_06 = "Num_People_Live_at_Address",
  g4_07 = "Num_Rooms_in_Household",
  g4_08 = "Separate_Room_for_Cooking",
  c1_09 = "Gender",
  c1_01 = "Singleton_or_Multiple_Birth",
  c1_03 = "Mother_Living_or_Deceased",
  c1_06a = "Location_of_Birth",
  c1_07 = "Size_at_Birth",
  c1_08b = "Weight_at_Birth",
  c1_12 = "Did_the_Baby_Cry",
  c1_13 = "Did_the_Baby_Move",
  c1_14 = "Did_the_Baby_Breathe",
  c1_20 = "Age_at_Onset_of_Illness",
  c1_21 = "Duration_of_Illness",
  c1_22a = "Location_of_Death",
  c1_25 = "Age_at_Time_of_Death",
  c4_01 = "Fever_During_Illness",
  c4_02 = "Duration_of_Fever_in_Days",
  c4_03 = "Did_the_Fever_Continue_to_Death",
  c4_04 = "Severity_of_Fever",
  c4_05 = "Fever_Pattern",
  c4_06 = "Loose_Liquid_Stool",
  c4_07b = "Highest_Num_Loose_Stool_per_Day_During_Illness",
  c4_08 = "Num_Days_Before_Death_Loose_Stool_Began",
  c4_09 = "Loose_Stool_Cont_Until_Death",
  c4_10 = "Num_Days_Before_Death_Loose_Stool_Stopped",
  c4_11 = "Blood_in_Stool",
  c4_12 = "Cough_During_Illness",
  c4_13 = "Duration_of_Cough",
  c4_14 = "Severity_of_Cough",
  c4_15 = "Vomitus_after_Coughing",
  c4_16 = "Difficulty_Breathing",
  c4_17 = "Duration_of_Difficulty_Breathing",
  c4_18 = "Fast_Breathing",
  c4_19 = "Duration_of_Fast_Breathing",
  c4_20 = "Indrawing_of_Chest",
  c4_22 = "Breathing_Stridor",
  c4_23 = "Breathing_Grunting",
  c4_24 = "Breathing_Wheezing",
  c4_25 = "Convulsions",
  c4_26 = "Loss_of_Consciousness",
  c4_27 = "Duration_Before_Death_LOC_Occurred",
  c4_28 = "Stiff_Neck",
  c4_29 = "Bulging_Fontanelle",
  c4_30 = "Skin_Rash",
  c4_33 = "Duration_of_Rash",
  c4_34 = "Blisters_Present_in_Rash",
  c4_35 = "Limbs_Become_Thin",
  c4_36 = "Swollen_Legs_or_Feet",
  c4_37 = "Duration_of_Swelling",
  c4_38 = "Skin_Flake_Off_in_Patches",
  c4_39 = "Hair_Color_Change_to_Red_Yellow",
  c4_40 = "Protruding_Belly",
  c4_41 = "Pallor_or_Lack_of_Blood",
  c4_42 = "Swelling_in_Armpits",
  c4_43 = "Whitish_Rash_in_Mouth",
  c4_44 = "Bleeding_Seen",
  c4_46 = "Skin_Turned_Black",
  c4_47_1 = "Suffered_Road_Traffic_Injury",
  c4_47_2 = "Suffered_a_Fall",
  c4_47_3 = "Suffered_Drowning",
  c4_47_4 = "Suffered_Poisoning",
  c4_47_5 = "Suffered_Bite_Sting",
  c4_47_6 = "Suffered_Burn_Fire",
  c4_47_7 = "Victim_of_Violence",
  c4_47_8a = "Other_Injury",
  c4_47_9 = "Unsure_if_Injury_Occurred",
  c4_47_10 = "Refused_to_Answer_if_Deceased_Suffered_Injury",
  c4_47_11 = "Did_Not_Suffer_Injury",
  c4_48 = "Injury_Intentionally_Inflicted_by_Someone",
  c4_49 = "Days_Survived_After_Injury",
  c5_01 = "Sought_Care_While_Ill",
  c5_02_1 = "Care_Sought_Traditional_Healer",
  c5_02_2 = "Care_Sought_Homeopath",
  c5_02_3 = "Care_Sought_Religious_Leader",
  c5_02_4 = "Care_Sought_Governmental_Hospital",
  c5_02_5 = "Care_Sought_Governmental_Health_Center_Clinic",
  c5_02_6 = "Care_Sought_Private_Hospital",
  c5_02_7 = "Care_Sought_Community_Based_Practioner",
  c5_02_8 = "Care_Sought_Trained_Birth_Attendant",
  c5_02_9 = "Care_Sought_Private_Physician",
  c5_02_10 = "Care_Sought_Pharmacy",
  c5_02_11a = "Care_Sought_Other_Provider",
  c5_02_12 = "Care_Sought_Relative_Friend",
  c5_04 = "Health_Records_for_Deceased",
  c5_05 = "Granted_Access_to_Health_Records",
  c5_10 = "Death_Certificate_Issued",
  c5_11 = "Granted_Access_to_Death_Certificate",
  c5_17 = "Mother_Ever_Tested_for_HIV",
  c5_18 = "Mother_HIV_Positive",
  c5_19 = "Mother_AIDS_Positive"
)
colnames(training_set_copy) <- mapvalues(
  colnames(training_set_copy),
  from = names(readable_names),
  to = unname(readable_names)
)
library(ggplot2)
Plot distribution of all the numerical variables with histogram
# One histogram per numeric column of training_set_copy, in column order,
# then arranged by ggarrange() in two columns.
numeric_cols <- names(training_set_copy)[
  vapply(training_set_copy, is.numeric, logical(1))
]
hist_list <- lapply(numeric_cols, function(col_name) {
  ggplot(data = training_set_copy, aes_string(col_name)) +
    geom_histogram(fill = 'lightblue', color = 'black') +
    labs(x = col_name, y = "Frequency") +
    theme(
      axis.text = element_text(size = 12),
      axis.title = element_text(size = 8)
    )
})
ggarrange(plotlist = hist_list, ncol = 2)
## $`1`
##
## $`2`
##
## $`3`
##
## $`4`
##
## $`5`
##
## $`6`
##
## $`7`
##
## $`8`
##
## attr(,"class")
## [1] "list" "ggarrange"
Plot bar charts for all the categorical variables
library(scales)
# Build one bar chart per factor column of training_set_copy. Charts are
# stored in bar_list in column order; z counts the factor columns seen so far.
bar_list <- list()
z <- 0
for (i in 1:ncol(training_set_copy)){
if(is.factor(training_set_copy[,i])){
z <- z+1
# Per-level count (number) and its share of all rows formatted as a percent
# label (percentage).
# NOTE(review): grouping by the one-column data frame (training_set_copy)[i]
# depends on how the installed dplyr names that grouping column - confirm
# that colnames(class)[1] is the variable's own name.
# NOTE(review): `class` shadows base::class as a variable name, and the
# unqualified mutate() may resolve to plyr::mutate if plyr is attached after
# dplyr - verify.
class <- training_set_copy %>%
group_by((training_set_copy)[i]) %>%
dplyr::summarise(number=n()) %>%
mutate(percentage=percent(number/sum(number))) %>%
mutate(pos = cumsum(number)- number/1.5) #Create a variable to indicate the position of labels
# x = first column of `class` (the levels), y = second column (the counts).
# The labels are placed with vjust; `pos` is not referenced by the plot.
bar <- ggplot(data=class,aes_string(x=colnames(class)[1],y=colnames(class)[2]))+
geom_bar(fill='lightblue',color='black',stat = "identity")+
labs(x=colnames(training_set_copy)[i],y= "Count")+
geom_text(aes(label=paste(number,"(",percentage,")")),vjust=-0.5,size=2,color="red")+
theme(axis.text=element_text(size=8,angle=45,hjust=1),axis.title=element_text(size=8))
bar_list[[z]] <- bar
}
}
# Cause of death has many levels, so it is plotted separately.
# The index 2 assumes the cause-of-death column is the second factor column.
bar_list[[2]]
# All remaining bar charts, arranged in two columns.
ggarrange(plotlist=bar_list[-2],ncol=2)
## $`1`
##
## $`2`
##
## $`3`
##
## $`4`
##
## $`5`
##
## $`6`
##
## $`7`
##
## $`8`
##
## $`9`
##
## $`10`
##
## $`11`
##
## $`12`
##
## $`13`
##
## $`14`
##
## $`15`
##
## $`16`
##
## $`17`
##
## $`18`
##
## $`19`
##
## $`20`
##
## $`21`
##
## $`22`
##
## $`23`
##
## $`24`
##
## $`25`
##
## $`26`
##
## $`27`
##
## $`28`
##
## $`29`
##
## $`30`
##
## $`31`
##
## $`32`
##
## $`33`
##
## $`34`
##
## $`35`
##
## $`36`
##
## $`37`
##
## $`38`
##
## attr(,"class")
## [1] "list" "ggarrange"
# Top 5 causes of death for the whole dataset: Pneumonia, Diarrhea/Dysentery,
# Other Defined Causes of Child Deaths, Sepsis, Malaria
top_5 <- c("Pneumonia","Diarrhea/Dysentery","Other Defined Causes of Child Deaths", "Sepsis", "Malaria")
training_set_copy_top5 <- training_set_copy[training_set_copy$gs_text34 %in% top_5, ]

# Rebuild every factor from its character values so that levels with 0 cases
# in the subset are removed; other columns are left as they are.
training_set_copy_top5[] <- lapply(training_set_copy_top5, function(column) {
  if (is.factor(column)) {
    as.factor(as.character(column))
  } else {
    column
  }
})
table(training_set_copy_top5$gs_text34)
##
## Diarrhea/Dysentery Malaria
## 108 45
## Other Defined Causes of Child Deaths Pneumonia
## 78 221
## Sepsis
## 64
We may also want to see the top 5 causes of death at each site
# Five most frequent causes of death per site.
# Count deaths per (site, cause), sort each site's causes by descending count
# and keep the first five rows of every site group.
top_5_by_site <- training_set_copy[, c("site", "gs_text34")] %>%
  group_by(site, gs_text34) %>%
  dplyr::summarise(count_cause = n()) %>%
  arrange(site, desc(count_cause)) %>%
  # `row_number() <= 5` keeps at most five rows per group. Comparing with
  # `== 1:5` only works through vector recycling: it warns when a group size
  # is not a multiple of 5 and breaks for groups with fewer than 5 rows.
  dplyr::filter(row_number() <= 5)
top_5_by_site
## # A tibble: 30 x 3
## # Groups: site [6]
## site gs_text34 count_cause
## <fct> <fct> <int>
## 1 AP Pneumonia 36
## 2 AP Other Defined Causes of Child Deaths 15
## 3 AP Diarrhea/Dysentery 11
## 4 AP Sepsis 10
## 5 AP Drowning 9
## 6 Bohol Pneumonia 72
## 7 Bohol Other Digestive Diseases 10
## 8 Bohol Diarrhea/Dysentery 9
## 9 Bohol Sepsis 9
## 10 Bohol Hemorrhagic fever 7
## # ... with 20 more rows
# Column names of the top-5 subset, printed with their positions.
var_list <- names(training_set_copy_top5)
print(var_list)
## [1] "site"
## [2] "gs_text34"
## [3] "Num_People_Live_at_Address"
## [4] "Num_Rooms_in_Household"
## [5] "Separate_Room_for_Cooking"
## [6] "Singleton_or_Multiple_Birth"
## [7] "Mother_Living_or_Deceased"
## [8] "Location_of_Birth"
## [9] "Size_at_Birth"
## [10] "Weight_at_Birth"
## [11] "Gender"
## [12] "Did_the_Baby_Cry"
## [13] "Did_the_Baby_Move"
## [14] "Did_the_Baby_Breathe"
## [15] "Age_at_Onset_of_Illness"
## [16] "Duration_of_Illness"
## [17] "Location_of_Death"
## [18] "Age_at_Time_of_Death"
## [19] "Fever_During_Illness"
## [20] "Duration_of_Fever_in_Days"
## [21] "Did_the_Fever_Continue_to_Death"
## [22] "Severity_of_Fever"
## [23] "Fever_Pattern"
## [24] "Loose_Liquid_Stool"
## [25] "Highest_Num_Loose_Stool_per_Day_During_Illness"
## [26] "Num_Days_Before_Death_Loose_Stool_Began"
## [27] "Loose_Stool_Cont_Until_Death"
## [28] "Num_Days_Before_Death_Loose_Stool_Stopped"
## [29] "Blood_in_Stool"
## [30] "Cough_During_Illness"
## [31] "Duration_of_Cough"
## [32] "Severity_of_Cough"
## [33] "Vomitus_after_Coughing"
## [34] "Difficulty_Breathing"
## [35] "Duration_of_Difficulty_Breathing"
## [36] "Fast_Breathing"
## [37] "Duration_of_Fast_Breathing"
## [38] "Indrawing_of_Chest"
## [39] "Breathing_Stridor"
## [40] "Breathing_Grunting"
## [41] "Breathing_Wheezing"
## [42] "Convulsions"
## [43] "Loss_of_Consciousness"
## [44] "Duration_Before_Death_LOC_Occurred"
## [45] "Stiff_Neck"
## [46] "Bulging_Fontanelle"
## [47] "Skin_Rash"
## [48] "Duration_of_Rash"
## [49] "Blisters_Present_in_Rash"
## [50] "Limbs_Become_Thin"
## [51] "Swollen_Legs_or_Feet"
## [52] "Duration_of_Swelling"
## [53] "Skin_Flake_Off_in_Patches"
## [54] "Hair_Color_Change_to_Red_Yellow"
## [55] "Protruding_Belly"
## [56] "Pallor_or_Lack_of_Blood"
## [57] "Swelling_in_Armpits"
## [58] "Whitish_Rash_in_Mouth"
## [59] "Bleeding_Seen"
## [60] "Skin_Turned_Black"
## [61] "Suffered_Road_Traffic_Injury"
## [62] "Suffered_a_Fall"
## [63] "Suffered_Drowning"
## [64] "Suffered_Poisoning"
## [65] "Suffered_Bite_Sting"
## [66] "Suffered_Burn_Fire"
## [67] "Victim_of_Violence"
## [68] "Other_Injury"
## [69] "Unsure_if_Injury_Occurred"
## [70] "Refused_to_Answer_if_Deceased_Suffered_Injury"
## [71] "Did_Not_Suffer_Injury"
## [72] "Injury_Intentionally_Inflicted_by_Someone"
## [73] "Days_Survived_After_Injury"
## [74] "Sought_Care_While_Ill"
## [75] "Care_Sought_Traditional_Healer"
## [76] "Care_Sought_Homeopath"
## [77] "Care_Sought_Religious_Leader"
## [78] "Care_Sought_Governmental_Hospital"
## [79] "Care_Sought_Governmental_Health_Center_Clinic"
## [80] "Care_Sought_Private_Hospital"
## [81] "Care_Sought_Community_Based_Practioner"
## [82] "Care_Sought_Trained_Birth_Attendant"
## [83] "Care_Sought_Private_Physician"
## [84] "Care_Sought_Pharmacy"
## [85] "Care_Sought_Other_Provider"
## [86] "Care_Sought_Relative_Friend"
## [87] "Health_Records_for_Deceased"
## [88] "Granted_Access_to_Health_Records"
## [89] "Death_Certificate_Issued"
## [90] "Granted_Access_to_Death_Certificate"
## [91] "Mother_Ever_Tested_for_HIV"
## [92] "Mother_HIV_Positive"
## [93] "Mother_AIDS_Positive"
## [94] "DOB"
## [95] "DOD"
## [96] "age_death"
Plot grouped box plots for numerical variables with top 5 cause of death (across the whole dataset)
var_list <- names(training_set_copy_top5)

# One grouped box plot per numeric column from position 3 onwards:
# x = first column (var_list[1]), fill = second column (var_list[2]).
numeric_positions <- Filter(
  function(j) is.numeric(training_set_copy_top5[, j]),
  3:length(var_list)
)
grouped_box_list <- lapply(numeric_positions, function(j) {
  ggplot(
    data = training_set_copy_top5,
    aes_string(x = var_list[1], y = var_list[j], fill = var_list[2])
  ) +
    geom_boxplot() +
    theme(
      axis.text = element_text(size = 15, angle = 45, hjust = 1),
      axis.title = element_text(size = 15),
      legend.key.size = unit(2, "cm"),
      legend.title = element_text(size = 15),
      legend.text = element_text(size = 15)
    )
})
grouped_box_list
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
Plot grouped bar charts for categorical variables with the top 5 cause of death (across the whole dataset) (I only plot for “Pneumonia”,“Diarrhea/Dysentery”,“Sepsis”)
# NOTE(review): dplyr is already attached at the top of the file; attaching it
# again presumably does not move it ahead of plyr on the search path, which is
# why dplyr verbs are called with an explicit dplyr:: prefix - confirm.
library(dplyr)
# Same five causes as defined for the top-5 subset.
top_5 <-c("Pneumonia","Diarrhea/Dysentery","Other Defined Causes of Child Deaths","Sepsis", "Malaria")
Compute the prevalence of each cause of death by site. These numbers will be used to determine whether a given cause is more prevalent in one group than in another across the different sites.
# Deaths per (site, cause) and each cause's share of its site's deaths,
# listed per site from most to least frequent. After summarise() the data are
# still grouped by site, so sum(count_cause) is the site total.
cause_prev_by_site <- training_set_copy %>%
  dplyr::select(site, gs_text34) %>%
  group_by(site, gs_text34) %>%
  dplyr::summarise(count_cause = n()) %>%
  dplyr::mutate(prevalence = count_cause / sum(count_cause)) %>%
  arrange(site, desc(count_cause))
print(cause_prev_by_site)
## # A tibble: 84 x 4
## # Groups: site [6]
## site gs_text34 count_cause prevalence
## <fct> <fct> <int> <dbl>
## 1 AP Pneumonia 36 0.259
## 2 AP Other Defined Causes of Child Deaths 15 0.108
## 3 AP Diarrhea/Dysentery 11 0.0791
## 4 AP Sepsis 10 0.0719
## 5 AP Drowning 9 0.0647
## 6 AP Hemorrhagic fever 9 0.0647
## 7 AP Road Traffic 9 0.0647
## 8 AP Fires 8 0.0576
## 9 AP Other Cardiovascular Diseases 6 0.0432
## 10 AP Violent Death 5 0.0360
## # ... with 74 more rows
# Pneumonia
# For every factor column from position 3 onwards (excluding site and
# gs_text34): tabulate the Pneumonia deaths by site and level, convert the
# counts into within-site shares and draw one bar chart faceted by site.
# Column 1 of training_set_copy is expected to be `site`.
pneumonia_positions <- Filter(
  function(j) {
    is.factor(training_set_copy[, j]) &&
      names(training_set_copy)[j] != "site" &&
      names(training_set_copy)[j] != "gs_text34"
  },
  3:length(names(training_set_copy))
)
grouped_bar_Pneumonia <- lapply(pneumonia_positions, function(j) {
  # Columns: site, <variable>, count, perc
  grouped_class <- training_set_copy[training_set_copy$gs_text34 == "Pneumonia", c(1, j)] %>%
    group_by_all() %>%
    dplyr::summarise(count = n()) %>%
    group_by(site) %>%
    dplyr::mutate(perc = count / sum(count))

  ggplot(
    data = grouped_class,
    aes_string(
      x = names(grouped_class)[2],
      y = names(grouped_class)[4],
      fill = names(grouped_class)[2]
    )
  ) +
    geom_bar(position = 'dodge', stat = 'identity') +
    labs(x = names(grouped_class)[2], y = "perc") +
    ggtitle(labs(title = "Cause of death: Pneumonia")) +
    theme(
      text = element_text(size = 12),
      axis.title = element_text(size = 12),
      axis.text = element_text(size = 12, hjust = 1, angle = 45)
    ) +
    geom_text(
      aes(label = scales::percent(perc), y = perc, size = 10),
      position = position_dodge(width = 1),
      vjust = -0.5,
      size = 4
    ) +
    facet_grid(. ~ site, scales = 'free')
})
grouped_bar_Pneumonia
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
##
## [[25]]
##
## [[26]]
##
## [[27]]
##
## [[28]]
##
## [[29]]
##
## [[30]]
##
## [[31]]
##
## [[32]]
##
## [[33]]
##
## [[34]]
##
## [[35]]
##
## [[36]]
##
## [[37]]
##
## [[38]]
##
## [[39]]
##
## [[40]]
##
## [[41]]
##
## [[42]]
##
## [[43]]
##
## [[44]]
##
## [[45]]
##
## [[46]]
##
## [[47]]
##
## [[48]]
##
## [[49]]
##
## [[50]]
##
## [[51]]
##
## [[52]]
##
## [[53]]
##
## [[54]]
##
## [[55]]
##
## [[56]]
##
## [[57]]
##
## [[58]]
##
## [[59]]
##
## [[60]]
##
## [[61]]
##
## [[62]]
##
## [[63]]
##
## [[64]]
##
## [[65]]
##
## [[66]]
##
## [[67]]
##
## [[68]]
##
## [[69]]
##
## [[70]]
##
## [[71]]
##
## [[72]]
##
## [[73]]
##
## [[74]]
##
## [[75]]
# "Diarrhea/Dysentery"
# For every factor column from position 3 onwards (excluding site and
# gs_text34): tabulate the Diarrhea/Dysentery deaths by site and level,
# convert the counts into within-site shares and draw one bar chart faceted
# by site. Column 1 of training_set_copy is expected to be `site`.
grouped_bar_Diarrhea <- list()
z <- 0
for (i in 3:length(names(training_set_copy))) {
  if (is.factor(training_set_copy[, i]) &&
      names(training_set_copy)[i] != "site" &&
      names(training_set_copy)[i] != "gs_text34") {
    z <- z + 1
    # Columns: site, <variable>, count, perc
    grouped_class <- training_set_copy[training_set_copy$gs_text34 == "Diarrhea/Dysentery", c(1, i)] %>%
      group_by_all() %>%
      dplyr::summarise(count = n()) %>%
      group_by(site) %>%
      dplyr::mutate(perc = count / sum(count))
    grouped_bar <- ggplot(
      data = grouped_class,
      aes_string(
        x = names(grouped_class)[2],
        y = names(grouped_class)[4],
        fill = names(grouped_class)[2]
      )
    ) +
      geom_bar(position = 'dodge', stat = 'identity') +
      labs(x = names(grouped_class)[2], y = "perc") +
      ggtitle(labs(title = "Cause of death: Diarrhea/Dysentery")) +
      theme(
        text = element_text(size = 12),
        axis.title = element_text(size = 12),
        axis.text = element_text(size = 12, hjust = 1, angle = 45)
      ) +
      geom_text(
        aes(label = scales::percent(perc), y = perc, size = 10),
        position = position_dodge(width = 1),
        vjust = -0.5,
        size = 4
      ) +
      facet_grid(. ~ site, scales = 'free')
    # Store in the Diarrhea list. Writing to grouped_bar_Pneumonia here would
    # overwrite the Pneumonia charts and leave grouped_bar_Diarrhea empty.
    grouped_bar_Diarrhea[[z]] <- grouped_bar
  }
}
grouped_bar_Diarrhea
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
##
## [[25]]
##
## [[26]]
##
## [[27]]
##
## [[28]]
##
## [[29]]
##
## [[30]]
##
## [[31]]
##
## [[32]]
##
## [[33]]
##
## [[34]]
##
## [[35]]
##
## [[36]]
##
## [[37]]
##
## [[38]]
##
## [[39]]
##
## [[40]]
##
## [[41]]
##
## [[42]]
##
## [[43]]
##
## [[44]]
##
## [[45]]
##
## [[46]]
##
## [[47]]
##
## [[48]]
##
## [[49]]
##
## [[50]]
##
## [[51]]
##
## [[52]]
##
## [[53]]
##
## [[54]]
##
## [[55]]
##
## [[56]]
##
## [[57]]
##
## [[58]]
##
## [[59]]
##
## [[60]]
##
## [[61]]
##
## [[62]]
##
## [[63]]
##
## [[64]]
##
## [[65]]
##
## [[66]]
##
## [[67]]
##
## [[68]]
##
## [[69]]
##
## [[70]]
##
## [[71]]
##
## [[72]]
##
## [[73]]
##
## [[74]]
##
## [[75]]
# Sepsis
# For every factor column from position 3 onwards (excluding site and
# gs_text34): tabulate the Sepsis deaths by site and level, convert the counts
# into within-site shares and draw one bar chart faceted by site.
# Column 1 of training_set_copy is expected to be `site`.
sepsis_positions <- Filter(
  function(j) {
    is.factor(training_set_copy[, j]) &&
      names(training_set_copy)[j] != "site" &&
      names(training_set_copy)[j] != "gs_text34"
  },
  3:length(names(training_set_copy))
)
grouped_bar_sepsis <- lapply(sepsis_positions, function(j) {
  # Columns: site, <variable>, count, perc
  grouped_class <- training_set_copy[training_set_copy$gs_text34 == "Sepsis", c(1, j)] %>%
    group_by_all() %>%
    dplyr::summarise(count = n()) %>%
    group_by(site) %>%
    dplyr::mutate(perc = count / sum(count))

  ggplot(
    data = grouped_class,
    aes_string(
      x = names(grouped_class)[2],
      y = names(grouped_class)[4],
      fill = names(grouped_class)[2]
    )
  ) +
    geom_bar(position = 'dodge', stat = 'identity') +
    labs(x = names(grouped_class)[2], y = "perc") +
    ggtitle(labs(title = "Cause of death: Sepsis")) +
    theme(
      text = element_text(size = 12),
      axis.title = element_text(size = 12),
      axis.text = element_text(size = 12, hjust = 1, angle = 45)
    ) +
    geom_text(
      aes(label = scales::percent(perc), y = perc, size = 10),
      position = position_dodge(width = 1),
      vjust = -0.5,
      size = 4
    ) +
    facet_grid(. ~ site, scales = 'free')
})
grouped_bar_sepsis
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
##
## [[25]]
##
## [[26]]
##
## [[27]]
##
## [[28]]
##
## [[29]]
##
## [[30]]
##
## [[31]]
##
## [[32]]
##
## [[33]]
##
## [[34]]
##
## [[35]]
##
## [[36]]
##
## [[37]]
##
## [[38]]
##
## [[39]]
##
## [[40]]
##
## [[41]]
##
## [[42]]
##
## [[43]]
##
## [[44]]
##
## [[45]]
##
## [[46]]
##
## [[47]]
##
## [[48]]
##
## [[49]]
##
## [[50]]
##
## [[51]]
##
## [[52]]
##
## [[53]]
##
## [[54]]
##
## [[55]]
##
## [[56]]
##
## [[57]]
##
## [[58]]
##
## [[59]]
##
## [[60]]
##
## [[61]]
##
## [[62]]
##
## [[63]]
##
## [[64]]
##
## [[65]]
##
## [[66]]
##
## [[67]]
##
## [[68]]
##
## [[69]]
##
## [[70]]
##
## [[71]]
##
## [[72]]
##
## [[73]]
##
## [[74]]
##
## [[75]]
EDA Visualization of Gender of Deceased
# Bar chart of the number of training rows per Gender level, one fill colour
# per level.
gen <- ggplot(data = training_set_copy) +
  geom_bar(
    mapping = aes(x = Gender, fill = Gender),
    stat = "Count"
  )
print(gen)